The purpose of this project is to check the performance of the machine learning algorithms and ensemble methods in predicting defaults in the lending club data.
First we need to start by loading the data.
The variable “loan_status” contains information as to whether the loan has been repaid or charged off (i.e., defaulted). Let’s create a binary factor variable for this. This variable will be the focus of this project.
#let's clean the data
lc_clean <- lc_raw %>%
  dplyr::select(-x20:-x80) %>%   # drop the empty columns
  filter(!is.na(int_rate)) %>%   # drop empty rows (no interest rate recorded)
  mutate(
    issue_d     = mdy(issue_d),        # lubridate::mdy() parses the month-day-year strings
    term        = factor(term_months), # loan term as a categorical variable
    delinq_2yrs = factor(delinq_2yrs), # delinquency count as a categorical variable
    # binary outcome: "Charged Off" -> "1" (default), "Fully Paid" -> "0"
    default     = factor(dplyr::recode(loan_status,
                                       "Charged Off" = "1",
                                       "Fully Paid" = "0"))
  ) %>%
  # keep every column, but push the less important ones to the end
  dplyr::select(-emp_title, -installment, -term_months, everything())
glimpse(lc_clean)## Observations: 37,869
## Variables: 21
## $ int_rate <dbl> 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0…
## $ loan_amnt <dbl> 8000, 6000, 6500, 8000, 5500, 6000, 10200, 15000,…
## $ dti <dbl> 2.11, 5.73, 17.68, 22.71, 5.75, 21.92, 18.62, 9.7…
## $ delinq_2yrs <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ annual_inc <dbl> 50000, 52800, 35352, 79200, 240000, 89000, 110000…
## $ grade <chr> "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",…
## $ emp_length <chr> "5 years", "< 1 year", "n/a", "n/a", "10+ years",…
## $ home_ownership <chr> "MORTGAGE", "MORTGAGE", "MORTGAGE", "MORTGAGE", "…
## $ verification_status <chr> "Verified", "Source Verified", "Not Verified", "V…
## $ issue_d <date> 2011-09-01, 2011-09-01, 2011-09-01, 2011-09-01, …
## $ zip_code <chr> "977xx", "228xx", "864xx", "322xx", "278xx", "760…
## $ addr_state <chr> "OR", "VA", "AZ", "FL", "NC", "TX", "CT", "WA", "…
## $ loan_status <chr> "Fully Paid", "Charged Off", "Fully Paid", "Fully…
## $ desc <chr> "Borrower added on 09/08/11 > Consolidating debt …
## $ purpose <chr> "debt_consolidation", "vacation", "credit_card", …
## $ title <chr> "Credit Card Payoff $8K", "bad choice", "Credit C…
## $ term <fct> 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 3…
## $ default <fct> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ term_months <dbl> 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 3…
## $ installment <dbl> 241.28, 180.96, 196.04, 241.28, 165.88, 180.96, 3…
## $ emp_title <chr> NA, "coral graphics", NA, "Honeywell", "O T Plus,…
Reducing the category in some of the variables for simplicity.
#Examine the final data frame
#Choose a subset of variables for classification: drop identifiers, free text
#and columns that duplicate information already encoded elsewhere
data_subset <- lc_clean %>%
  select(-issue_d, -zip_code, -addr_state, -desc, -purpose, -title,
         -installment, -emp_title, -term_months, -loan_status)
glimpse(data_subset)## Observations: 37,869
## Variables: 11
## $ int_rate <dbl> 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0…
## $ loan_amnt <dbl> 8000, 6000, 6500, 8000, 5500, 6000, 10200, 15000,…
## $ dti <dbl> 2.11, 5.73, 17.68, 22.71, 5.75, 21.92, 18.62, 9.7…
## $ delinq_2yrs <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ annual_inc <dbl> 50000, 52800, 35352, 79200, 240000, 89000, 110000…
## $ grade <chr> "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",…
## $ emp_length <chr> "5 years", "< 1 year", "n/a", "n/a", "10+ years",…
## $ home_ownership <chr> "MORTGAGE", "MORTGAGE", "MORTGAGE", "MORTGAGE", "…
## $ verification_status <chr> "Verified", "Source Verified", "Not Verified", "V…
## $ term <fct> 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 3…
## $ default <fct> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
#Reduce the categories for delinquencies: keep "0", "1", "2" and pool
#everything larger into "3". Check the results of the following code.
data_subset <- data_subset %>%
  mutate(delinq_2yrs = ifelse(as.character(delinq_2yrs) %in% c("0", "1", "2"),
                              as.character(delinq_2yrs), "3"))
# Fix: state the levels explicitly instead of relying on factor()'s implicit
# (sorted) level order when attaching labels.
data_subset$delinq_2yrs <- factor(data_subset$delinq_2yrs,
                                  levels = c("0", "1", "2", "3"))
#Reduce the categories for employment length: keep < 1 to 4 years and "n/a",
#pool everything else into "More_than_5"
data_subset <- data_subset %>%
  mutate(emp_length = ifelse(as.character(emp_length) %in%
                               c("< 1 year", "1 year", "2 years",
                                 "3 years", "4 years", "n/a"),
                             as.character(emp_length), "More_than_5"))
#Make sure emp_length is a factor.
# Fix: the original used factor(..., labels = ...) WITHOUT levels =, so the
# label-to-level mapping depended on the locale's sort order of the strings
# (whether "< 1 year" sorts before or after "1 year" varies by collation).
# Explicit levels make the mapping unambiguous and locale-independent.
data_subset$emp_length <- factor(data_subset$emp_length,
                                 levels = c("< 1 year", "1 year", "2 years",
                                            "3 years", "4 years",
                                            "More_than_5", "n/a"),
                                 labels = c("less_than1", "1_year", "2_years",
                                            "3_years", "4_years",
                                            "More_than_5", "No_Info"))
table(data_subset$emp_length)##
## less_than1 1_year 2_years 3_years 4_years More_than_5
## 4384 3103 4168 3910 3305 17978
## No_Info
## 1021
#Make sure default is a factor with readable labels: "0" -> "No", "1" -> "Yes".
# Fix: the original first rewrote the levels with make.names() ("X0"/"X1") and
# then relied on the implicit level order when relabelling; mapping the levels
# explicitly produces the same result but cannot silently swap the labels.
data_subset$default <- factor(data_subset$default,
                              levels = c("0", "1"),
                              labels = c("No", "Yes"))
#Make sure grade is a factor
data_subset$grade <- factor(data_subset$grade)
#Reduce the categories for home_ownership: keep MORTGAGE/OWN/RENT, pool the
#rare remaining categories into "OTHER"
data_subset <- data_subset %>%
  mutate(home_ownership = ifelse(as.character(home_ownership) %in%
                                   c("MORTGAGE", "OWN", "RENT"),
                                 as.character(home_ownership), "OTHER"))
data_subset$home_ownership <- factor(data_subset$home_ownership)
table(data_subset$home_ownership)##
## MORTGAGE OTHER OWN RENT
## 16751 91 2891 18136
#Reduce the categories for verification_status: pool "Source Verified" into
#"Verified" and rename "Not Verified" to the syntactically valid "Not_Verified".
# Fix: the original captured nv <- levels(...)[2], but factor levels sort
# alphabetically ("Not Verified", "Source Verified", "Verified"), so nv was
# "Source Verified" — which had already been recoded — and the
# "Not Verified" -> "Not_Verified" rename never fired (the output table showed
# "Not Verified" with a space). Match the strings directly instead.
data_subset <- data_subset %>%
  mutate(
    verification_status = ifelse(as.character(verification_status) == "Source Verified",
                                 "Verified", as.character(verification_status)),
    verification_status = ifelse(verification_status == "Not Verified",
                                 "Not_Verified", verification_status),
    verification_status = ifelse(verification_status == "0",
                                 "No_Info", verification_status)
  )
data_subset$verification_status <- factor(data_subset$verification_status)
table(data_subset$verification_status)##
## Not Verified Verified
## 16075 21794
## # A tibble: 6 x 11
## int_rate loan_amnt dti delinq_2yrs annual_inc grade emp_length
## <dbl> <dbl> <dbl> <fct> <dbl> <fct> <fct>
## 1 0.05 8000 2.11 0 50000 A More_than…
## 2 0.05 6000 5.73 0 52800 A less_than1
## 3 0.05 6500 17.7 0 35352 A No_Info
## 4 0.05 8000 22.7 0 79200 A No_Info
## 5 0.05 5500 5.75 0 240000 A More_than…
## 6 0.05 6000 21.9 0 89000 A 2_years
## # … with 4 more variables: home_ownership <fct>, verification_status <fct>,
## # term <fct>, default <fct>
Generate training and test data sets.
library(rsample)
# Fix the RNG seed so the train/test split is reproducible
set.seed(100)
train_test_split <- initial_split(data_subset, prop = 0.75) #training set contains 75% of the data
train <- training(train_test_split)
test <- testing(train_test_split)
# Aliases used by the modelling code below (kept so later chunks can refer to
# a single pair of names even if the underlying split changes)
combined_train<-train
combined_test<-test
head(combined_train)## # A tibble: 6 x 11
## int_rate loan_amnt dti delinq_2yrs annual_inc grade emp_length
## <dbl> <dbl> <dbl> <fct> <dbl> <fct> <fct>
## 1 0.05 8000 2.11 0 50000 A More_than…
## 2 0.05 6000 5.73 0 52800 A less_than1
## 3 0.05 8000 22.7 0 79200 A No_Info
## 4 0.05 5500 5.75 0 240000 A More_than…
## 5 0.05 6000 21.9 0 89000 A 2_years
## 6 0.05 10200 18.6 0 110000 A More_than…
## # … with 4 more variables: home_ownership <fct>, verification_status <fct>,
## # term <fct>, default <fct>
When we plot both the in-sample and out-of-sample ROC curves, we can see that the AUC (area under the curve) for the training dataset is 68.46% and the AUC for the testing dataset is 68.11%. Therefore, for out-of-sample prediction the performance decreases slightly. To improve out-of-sample prediction performance, we conduct out-of-sample validation and three-way data partitioning. Three-way data partitioning splits the data into three sets: training, validation, and testing. Its drawback is that, by dividing the dataset into three sections, it leaves less data available to estimate the model. However, this problem can be overcome by using K-fold cross validation.
Regularization adds a penalty (scaled by 'lambda') on the magnitude of the sum of the estimated coefficients. This regularization improves out-of-sample prediction performance because 'lambda' is chosen to maximise out-of-sample performance through K-fold cross validation.
#I will use caret to fit a logistic regression model
# method="cv" = K-fold cross validation with number=2 folds: fit on one fold,
# validate on the other, then swap roles. More folds = slower training.
# classProbs=TRUE is required so the model returns class probabilities, which
# twoClassSummary needs to compute ROC/Sens/Spec.
train_control <- trainControl(method="cv", number=2, classProbs=TRUE,
summaryFunction=twoClassSummary,verboseIter = TRUE)
#Fit a logistic regression model (glm with a binomial family), selecting by ROC/AUC
logistic2<- train(default~., data = combined_train, preProcess = c("center", "scale"),
method="glm", family="binomial", metric="ROC", trControl=train_control)#glm = logistic regression## + Fold1: parameter=none
## - Fold1: parameter=none
## + Fold2: parameter=none
## - Fold2: parameter=none
## Aggregating results
## Fitting final model on full training set
## parameter ROC Sens Spec ROCSD SensSD SpecSD
## 1 none 0.6793063 0.999671 0.002203722 0.002725625 0.00023262 0.0003462815
## Generalized Linear Model
##
## 28402 samples
## 10 predictor
## 2 classes: 'No', 'Yes'
##
## Pre-processing: centered (24), scaled (24)
## Resampling: Cross-Validated (2 fold)
## Summary of sample sizes: 14201, 14201
## Resampling results:
##
## ROC Sens Spec
## 0.6793063 0.999671 0.002203722
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.3339 -0.6039 -0.4766 -0.3361 4.1306
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.949569 0.019579 -99.574 < 2e-16 ***
## int_rate 0.355409 0.061217 5.806 6.41e-09 ***
## loan_amnt 0.022037 0.021010 1.049 0.294225
## dti 0.017375 0.017988 0.966 0.334102
## delinq_2yrs1 -0.037960 0.017356 -2.187 0.028734 *
## delinq_2yrs2 0.025900 0.015251 1.698 0.089464 .
## delinq_2yrs3 -0.011393 0.016135 -0.706 0.480124
## annual_inc -0.308850 0.029801 -10.364 < 2e-16 ***
## gradeB 0.156654 0.038361 4.084 4.43e-05 ***
## gradeC 0.199011 0.046591 4.271 1.94e-05 ***
## gradeD 0.176313 0.051160 3.446 0.000568 ***
## gradeE 0.128179 0.046771 2.741 0.006134 **
## gradeF 0.085660 0.035736 2.397 0.016531 *
## gradeG 0.047385 0.024093 1.967 0.049211 *
## emp_length1_year -0.008167 0.022005 -0.371 0.710526
## emp_length2_years -0.028245 0.023352 -1.210 0.226457
## emp_length3_years -0.026129 0.023090 -1.132 0.257787
## emp_length4_years -0.028647 0.022426 -1.277 0.201458
## emp_lengthMore_than_5 0.036335 0.029181 1.245 0.213070
## emp_lengthNo_Info 0.103044 0.017109 6.023 1.71e-09 ***
## home_ownershipOTHER 0.029787 0.015545 1.916 0.055346 .
## home_ownershipOWN 0.010019 0.018278 0.548 0.583575
## home_ownershipRENT 0.024112 0.019786 1.219 0.222963
## verification_statusVerified -0.031793 0.019204 -1.656 0.097822 .
## term60 0.195236 0.019041 10.253 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 23391 on 28401 degrees of freedom
## Residual deviance: 21906 on 28377 degrees of freedom
## AIC: 21956
##
## Number of Fisher Scoring iterations: 5
Predict the probabilities for the test data.
# Next I predict the probabilities and then plot the ROC curve.
# Column 2 of the probability matrix is P(default = "Yes"), i.e. the
# estimated probability of default.
watched_prob <- predict(logistic2, combined_test, type = "prob")[,2]   # out-of-sample
watched_prob2 <- predict(logistic2, combined_train, type = "prob")[,2] # in-sample
library(pROC)
# Fix: ROC_lr (the out-of-sample ROC object) is used below and in later chunks
# but was never computed in this chunk; add the missing roc() call.
ROC_lr <- roc(combined_test$default, watched_prob)
ROC_tr <- roc(combined_train$default, watched_prob2)
# Calculate the area under the curve (AUC), expressed as a percentage
AUC_lr <- round(auc(ROC_lr)*100, digits=2)
AUC_tr <- round(auc(ROC_tr)*100, digits=2)
# Plot the ROC curves
plot(ROC_lr, col = "blue",main=paste("LogReg Lending-Out of Sample Data AUC=",AUC_lr,"%",sep = ""))Plot the estimated probabilities for default and non-defaults
ggplot( combined_test, aes( watched_prob, color = as.factor(default) ) ) +
geom_density( size = 1 ) +
ggtitle( "Test Data Set's Predicted Score_pdf" ) +
xlab("Estimated Probability")ggplot( combined_test, aes( watched_prob, color = as.factor(default) ) ) +
stat_ecdf( size = 1 ) +
ggtitle( "Test Data Set's Predicted Score_cdf" ) +
xlab("Estimated Probability")#cdf: 75% of them is below probability 0.2
#For who are default, when the probability of default is 0.2, the y is around 0.625
#pdf: basically a histogram - the probability of being default is shown as a count (density)Basically, when the ROC is very high, there is a neat separation between the two lines representing yes and no in the cdf, on the other hand when ROC is equal to 50% (random model) no such distinction is visible. In other words, when the two CDF lines are closer, the ROC curve becomes closer to the neutral line since it means that there are not much differences between two outcomes from logistic regression.
Additionally, the Cumulative Distribution Function (CDF) plot shows the cumulative distribution of the estimated probability of default. We are able to see that the predictions for non-defaults reach a cumulative value of 1 further to the left than the predictions for defaults. On the other hand, the Receiver Operating Characteristic (ROC) curve plots the model's sensitivity (true positive rate) against 1 - specificity (the false positive rate) for different cutoff values.
Fit a K-NN model and choose k that gives the best AUC performance.
# Below I use 'train' function from caret library.
#Let's look at the list of tunable parameters
modelLookup("knn") #the only tunable parameter for knn is k (the number of neighbours)## model parameter label forReg forClass probModel
## 1 knn k #Neighbors TRUE TRUE TRUE
#Let's set the search grid first
knnGrid <- expand.grid(k = seq(100,300, by = 50)) #try k = 100, 150, 200, 250 and 300; caret keeps the k with the best cross-validated AUC
#I will use AUC to choose the best k, hence we need the class probabilities.
control <- train_control
# By fixing the seed I can re-generate the results when needed
set.seed(100)
# 'preProcess': I use this option to center and scale the data
# 'method' is knn
# 'metric' is ROC or AUC
# I already defined the 'trControl' and 'tuneGrid' options above
fit_KNN <- train(default~., data=combined_train,preProcess = c("center", "scale"), #with KNN we always center and scale (K-NN is distance-based)
method="knn", metric="ROC", trControl=control,tuneGrid = knnGrid)## + Fold1: k=100
## - Fold1: k=100
## + Fold1: k=150
## - Fold1: k=150
## + Fold1: k=200
## - Fold1: k=200
## + Fold1: k=250
## - Fold1: k=250
## + Fold1: k=300
## - Fold1: k=300
## + Fold2: k=100
## - Fold2: k=100
## + Fold2: k=150
## - Fold2: k=150
## + Fold2: k=200
## - Fold2: k=200
## + Fold2: k=250
## - Fold2: k=250
## + Fold2: k=300
## - Fold2: k=300
## Aggregating results
## Selecting tuning parameters
## Fitting k = 200 on full training set
## k-Nearest Neighbors
##
## 28402 samples
## 10 predictor
## 2 classes: 'No', 'Yes'
##
## Pre-processing: centered (24), scaled (24)
## Resampling: Cross-Validated (2 fold)
## Summary of sample sizes: 14201, 14201
## Resampling results across tuning parameters:
##
## k ROC Sens Spec
## 100 0.6612913 1 0
## 150 0.6651249 1 0
## 200 0.6656383 1 0
## 250 0.6650190 1 0
## 300 0.6647664 1 0
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was k = 200.
# Out-of-sample probability of default ([,2] = P(class "Yes")) from the tuned K-NN
k_nn_probs<-predict(fit_KNN, newdata = combined_test,type = "prob")[,2]
# Let's find the ROC values using 'roc' function from pROC.
ROC_knn <- roc(combined_test$default, k_nn_probs)## Setting levels: control = No, case = Yes
## Setting direction: controls < cases
# Let's find AUC using the 'auc' function and round it for ease of notation.
AUC_knn<-round(ROC_knn$auc*100, digits=2)Compare the out of sample performance of K-NN with logistic regression by plotting the ROC curves for both in the same graph. Use ggroc package as I demonstrate in the rmd file from Session 3.
g2 <- ggroc(list("Logistic Regression"=ROC_lr, "K-NN"=ROC_knn))
g2+ggtitle(paste("AUC Logistic Regression =",AUC_lr,"%"," vs AUC KNN=",AUC_knn,"%",sep="" ))+geom_segment(aes(x = 1, xend = 0, y = 0, yend = 1), color="grey", linetype="dashed")+
theme_light()+
labs(
x = "Specificity",
y = "Sensitivity",
linetype = "Model Type"
)Fit a tree the function rpart. Plot the resulting tree. Compare the results with logistic regression. Finally check the out of sample performance of the model.
Based on the tree visualisation, the three variables - grade, term and annual income - seem to be the most important for determining the dependent variable. However, when I run the varImp function to calculate the overall importance of variables, it gives somewhat different results: the three most important variables are int_rate, grade and term. This happens because the variable importance calculation is carried out in a more involved way than simply reading off the tree splits. The variable importance is measured by the sum of the goodness-of-split measures for each split for which it was the primary variable, plus goodness * (adjusted agreement) for all splits in which it was a surrogate. (Source: An Introduction to Recursive Partitioning Using the RPART Routines)
Seeing the ROC plots of logistic regression and the tree model, we can see that logistic regression is generally working better than tree model with this dataset. Thus, there is no reason to use tree model over logistic regression in this case.
# rpart.control: cp = 0 disables complexity-parameter pruning, maxdepth caps
# the tree at 10 levels, minbucket = 50 requires at least 50 observations per
# terminal leaf, and minsplit = 2 allows any eligible node to be split.
# NOTE(review): this reassigns 'control', which earlier held the caret
# trainControl object — a distinct name would avoid confusion.
control=rpart.control(cp = 0, maxdepth = 10, minbucket=50, minsplit=2)
LC_treeModel <- rpart(default~., data=train, method = "class",control =control)
rpart.plot(LC_treeModel)## Overall
## annual_inc 80.859091
## dti 7.099870
## emp_length 15.608245
## grade 232.662456
## home_ownership 4.492477
## int_rate 258.186296
## loan_amnt 37.812541
## term 213.252828
## delinq_2yrs 0.000000
## verification_status 0.000000
Find the estimated probabilites in the test data set. Then plot the ROC’s and estimated probabilities
# Estimated default probabilities on the test set from the tree ([,2] = P("Yes"))
defprob_trees<-predict(LC_treeModel, newdata = combined_test,type = "prob")[,2]
ROC_tree <- roc(combined_test$default, defprob_trees)## Setting levels: control = No, case = Yes
## Setting direction: controls < cases
# Out-of-sample AUC of the tree, as a percentage
AUC_tree <- round(ROC_tree$auc*100, digits=2)
#Plot ROC's
g2 <- ggroc(list("Logistic Regression"=ROC_lr,"Trees"=ROC_tree))
g2+ggtitle(paste("Logistic Regression=",AUC_lr,"%"," vs Tree=",AUC_tree,"%",sep="" ))+geom_segment(aes(x = 1, xend = 0, y = 0, yend = 1), color="grey", linetype="dashed")#Plot the estimated probabilities for default and non-defaults
ggplot( combined_test, aes( defprob_trees, color = as.factor(default) ) ) +
geom_density( size = 1 ) +
ggtitle( "Test Set's Predicted Score" ) +
xlab("Estimated Probability")ggplot( combined_test, aes( defprob_trees, color = as.factor(default) ) ) +
stat_ecdf( size = 1 ) +
ggtitle( "Test Set's Predicted Score" ) +
xlab("Estimated Probability")Use caret package to tune your tree using the complexity parameter cp. Check the accuracy of your fitted model.
#However, expand the search grid below after you run it with basic parameters
modelLookup("rpart") #cp = complexity## model parameter label forReg forClass probModel
## 1 rpart cp Complexity Parameter TRUE TRUE TRUE
#Expand the search grid after you run it with basic parameters
# Search 21 values of the complexity parameter cp, from 0 to 2e-04
Grid <- expand.grid(cp = seq(0.0000,0.0002,by=0.00001))
# Tune the tree by cross-validated AUC; minbucket = 25 keeps terminal leaves
# from becoming too small while cp controls the amount of pruning
dtree_fit <- train(default~., data=train,
method = "rpart",
metric="ROC",
trControl=train_control,
control=rpart.control(minbucket = 25),
tuneGrid=Grid) ## + Fold1: cp=0
## - Fold1: cp=0
## + Fold2: cp=0
## - Fold2: cp=0
## Aggregating results
## Selecting tuning parameters
## Fitting cp = 2e-04 on full training set
## CART
##
## 28402 samples
## 10 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (2 fold)
## Summary of sample sizes: 14201, 14201
## Resampling results across tuning parameters:
##
## cp ROC Sens Spec
## 0.00000 0.6075141 0.9856896 0.03746327
## 0.00001 0.6075141 0.9856896 0.03746327
## 0.00002 0.6075141 0.9856896 0.03746327
## 0.00003 0.6075141 0.9856896 0.03746327
## 0.00004 0.6075141 0.9856896 0.03746327
## 0.00005 0.6075141 0.9856896 0.03746327
## 0.00006 0.6075141 0.9856896 0.03746327
## 0.00007 0.6075141 0.9856896 0.03746327
## 0.00008 0.6075141 0.9856896 0.03746327
## 0.00009 0.6075141 0.9856896 0.03746327
## 0.00010 0.6075141 0.9856896 0.03746327
## 0.00011 0.6075141 0.9856896 0.03746327
## 0.00012 0.6075141 0.9856896 0.03746327
## 0.00013 0.6075141 0.9856896 0.03746327
## 0.00014 0.6075141 0.9856896 0.03746327
## 0.00015 0.6075141 0.9856896 0.03746327
## 0.00016 0.6075141 0.9856896 0.03746327
## 0.00017 0.6079714 0.9866765 0.03599412
## 0.00018 0.6079714 0.9866765 0.03599412
## 0.00019 0.6079714 0.9866765 0.03599412
## 0.00020 0.6079714 0.9866765 0.03599412
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was cp = 2e-04.
defprob_trees<-predict(dtree_fit, newdata = combined_test,type = "prob")[,2]
ROC_tree <- roc(combined_test$default, defprob_trees)## Setting levels: control = No, case = Yes
## Setting direction: controls < cases
AUC_tree<-round(ROC_tree$auc*100, digits=2)
g2 <- ggroc(list("Log Reg"=ROC_lr,"Trees"=ROC_tree))
g2+ggtitle(paste("Log Reg=",AUC_lr,"%"," vs Tree=",AUC_tree,"%",sep="" ))+geom_segment(aes(x = 1, xend = 0, y = 0, yend = 1), color="grey", linetype="dashed")Plot the estimated probabilities.
#Plot the estimated probabilities for default and non-defaults
ggplot( combined_test, aes( defprob_trees, color = as.factor(default) ) ) +
geom_density( size = 1 ) +
ggtitle( "Test Set's Predicted Score" ) +
xlab("Estimated Probability")ggplot( combined_test, aes( defprob_trees, color = as.factor(default) ) ) +
stat_ecdf( size = 1 ) +
ggtitle( "Test Set's Predicted Score" ) +
xlab("Estimated Probability")Fit a random forest (RF) using the randomForest function.
From the ROC plot, I observed that the AUC value for Random Forest is much lower than logistic regression. The AUC is almost 10% lower for random forest in this case.
#Examine the results it produces
set.seed(100)
# I will use 'randomForest' library
library(randomForest)## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
# 'randomForest' function inherits the parameters of 'rpart' function that control tree growth.
# Additional parameters are
# i) 'ntree': number of trees in the forest
# ii) 'mtry': number of randomly chosen variables to do a split each time
# iii) 'importance': is an option to get a sense of the importance of the variables. We will use it below.
# mtry = 3 ~ sqrt(10 predictors); maxnodes/nodesize bound the size of each tree.
# NOTE(review): the '_BBC_' in the name looks like a leftover from another
# project; kept as-is because later chunks refer to this object by this name.
RForest_BBC_Model <- randomForest(default~., data=train,
ntree = 200,mtry=3,maxnodes=50,nodesize=2)
# Print the model output
print(RForest_BBC_Model)##
## Call:
## randomForest(formula = default ~ ., data = train, ntree = 200, mtry = 3, maxnodes = 50, nodesize = 2)
## Type of random forest: classification
## Number of trees: 200
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 14.38%
## Confusion matrix:
## No Yes class.error
## No 24318 0 0
## Yes 4084 0 1
# Find the predictions in the test data
defprob_RF<-predict(RForest_BBC_Model, newdata = combined_test,type = "prob")[,2]
ROC_RF <- roc(combined_test$default, defprob_RF)## Setting levels: control = No, case = Yes
## Setting direction: controls < cases
AUC_RF<-round(ROC_RF$auc*100, digits=2)
##PLot the roc's
g2 <- ggroc(list("Log Reg"=ROC_lr,"Random Forest"=ROC_RF))
g2+ggtitle(paste("Log Reg=",AUC_lr,"%"," vs RF=",AUC_RF,"%",sep="" ))+geom_segment(aes(x = 1, xend = 0, y = 0, yend = 1), color="grey", linetype="dashed")Fit a RF and hypertune the number of features the algorithm chooses in each step (variable mtry) using the caret package and ranger method. Choose a few values as otherwise this may take a very long time to run.
Firstly, I chose mtry to be 3 because it is the closest integer to the square root of 10 (the number of explanatory variables). I then tried different values to check which mtry gives the highest accuracy, and I ended up with 2. When mtry - the number of variables the random forest randomly considers when splitting a branch - is 2, the model works best.
We can further improve the performance of the model by choosing different variables, by feature engineering, and by tuning the hyperparameters of the algorithm (e.g. different split rules). Beyond that, we can also change the minimum node size. By experimenting with all the parameters and variables I could reach an AUC of 68.51%, which is 0.40% higher than logistic regression.
By looking at the variable importance using varImp() function, we can see that 1) interest rate, 2) term 60, 3) gradeD are the most important variables for random forest model to decide the branches and explain the most of the dependent variable.
## model parameter label forReg forClass probModel
## 1 ranger mtry #Randomly Selected Predictors TRUE TRUE TRUE
## 2 ranger splitrule Splitting Rule TRUE TRUE TRUE
## 3 ranger min.node.size Minimal Node Size TRUE TRUE TRUE
# Define the tuning grid: tuneGrid
# mtry and splitrule are held fixed; only the minimal node size is tuned (5-10)
gridRF <- data.frame(
.mtry = 2,
.splitrule = "gini",
.min.node.size = c(5:10)
)
# Fit random forest: model
# Uses five selected predictors only; importance = 'permutation' is requested
# so varImp() can rank the variables afterwards
rf_RF <- train(
default ~ annual_inc + term + grade + loan_amnt + int_rate, data = train,
method = "ranger",
metric = "ROC",
trControl = train_control,
tuneGrid = gridRF,
importance = 'permutation'
)## + Fold1: mtry=2, splitrule=gini, min.node.size= 5
## - Fold1: mtry=2, splitrule=gini, min.node.size= 5
## + Fold1: mtry=2, splitrule=gini, min.node.size= 6
## - Fold1: mtry=2, splitrule=gini, min.node.size= 6
## + Fold1: mtry=2, splitrule=gini, min.node.size= 7
## - Fold1: mtry=2, splitrule=gini, min.node.size= 7
## + Fold1: mtry=2, splitrule=gini, min.node.size= 8
## - Fold1: mtry=2, splitrule=gini, min.node.size= 8
## + Fold1: mtry=2, splitrule=gini, min.node.size= 9
## - Fold1: mtry=2, splitrule=gini, min.node.size= 9
## + Fold1: mtry=2, splitrule=gini, min.node.size=10
## - Fold1: mtry=2, splitrule=gini, min.node.size=10
## + Fold2: mtry=2, splitrule=gini, min.node.size= 5
## - Fold2: mtry=2, splitrule=gini, min.node.size= 5
## + Fold2: mtry=2, splitrule=gini, min.node.size= 6
## - Fold2: mtry=2, splitrule=gini, min.node.size= 6
## + Fold2: mtry=2, splitrule=gini, min.node.size= 7
## - Fold2: mtry=2, splitrule=gini, min.node.size= 7
## + Fold2: mtry=2, splitrule=gini, min.node.size= 8
## - Fold2: mtry=2, splitrule=gini, min.node.size= 8
## + Fold2: mtry=2, splitrule=gini, min.node.size= 9
## - Fold2: mtry=2, splitrule=gini, min.node.size= 9
## + Fold2: mtry=2, splitrule=gini, min.node.size=10
## - Fold2: mtry=2, splitrule=gini, min.node.size=10
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 2, splitrule = gini, min.node.size = 9 on full training set
## Random Forest
##
## 28402 samples
## 5 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (2 fold)
## Summary of sample sizes: 14201, 14201
## Resampling results across tuning parameters:
##
## min.node.size ROC Sens Spec
## 5 0.6805035 0.9999589 0.0007345739
## 6 0.6806124 1.0000000 0.0009794319
## 7 0.6805427 0.9999178 0.0012242899
## 8 0.6805716 0.9999589 0.0012242899
## 9 0.6807850 0.9999178 0.0012242899
## 10 0.6801889 1.0000000 0.0009794319
##
## Tuning parameter 'mtry' was held constant at a value of 2
## Tuning
## parameter 'splitrule' was held constant at a value of gini
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were mtry = 2, splitrule = gini
## and min.node.size = 9.
## [1] "label" "library" "check" "loop" "type"
## [6] "parameters" "grid" "fit" "predict" "prob"
## [11] "predictors" "varImp" "levels" "oob" "tags"
## [16] "sort"
Examine the predictive performance of the best RF model using the test data set.
# Out-of-sample default probabilities from the tuned ranger model ([,2] = P("Yes"))
defprob_RF <- predict(rf_RF,combined_test, type = "prob")[,2]
ROC_forest <- roc(combined_test$default, defprob_RF)
# Out-of-sample AUC of the tuned random forest, as a percentage
AUC_forest=round(ROC_forest$auc*100, digits=2)
# Plot the ROC curve
g2 <- ggroc(list("Log Reg"=ROC_lr, "RF"=ROC_forest))
g2+ggtitle(paste("Log Reg=",AUC_lr,"%"," vs RF=",AUC_forest,"%",sep="" ))+geom_segment(aes(x = 1, xend = 0, y = 0, yend = 1), color="grey", linetype="dashed")##Tip: correlation is not a issue in random forest because it's not related with P-values
## and the errors are not related to this modelPlot the estimated probabilities.
#Plot the estimated probabilities for default and non-defaults
ggplot( combined_test, aes( defprob_RF, color = as.factor(default) ) ) +
geom_density( size = 1 ) +
ggtitle( "Test Set's Predicted Score" ) +
xlab("Estimated Probability")ggplot( combined_test, aes( defprob_RF, color = as.factor(default) ) ) +
stat_ecdf( size = 1 ) +
ggtitle( "Test Set's Predicted Score" ) +
xlab("Estimated Probability")Fit a GBM model using Caret and hypertune its parameters. Check variable importance in this model.
Compared to logistic regression GBM has higher ROC but not as much as Random Forest does. GBM shows 68.46% of AUC and it’s 0.35% higher than that of logistic regression.
# However, expand the search grid once you run this initial code
set.seed(100)
# Reuse the cross-validation controls defined earlier in the document
ctrl <- train_control
# Hyperparameter grid: tree depth 1-5, 100-300 trees, shrinkage 0.01-0.03,
# minimum node size held at 10
grid <- expand.grid(
  interaction.depth = seq(1, 5, by = 1),
  n.trees = seq(100, 300, by = 100),
  shrinkage = seq(0.01, 0.03, by = 0.01),
  n.minobsinnode = 10
)
# Tune the gradient boosting machine, selecting on cross-validated ROC (AUC)
gbmFit1 <- train(
  default ~ ., data = train,
  method = "gbm",
  trControl = ctrl,
  metric = "ROC",
  preProcess = c("center", "scale"),
  tuneGrid = grid,
  verbose = FALSE
)
## + Fold1: shrinkage=0.01, interaction.depth=1, n.minobsinnode=10, n.trees=300
## - Fold1: shrinkage=0.01, interaction.depth=1, n.minobsinnode=10, n.trees=300
## + Fold1: shrinkage=0.01, interaction.depth=2, n.minobsinnode=10, n.trees=300
## - Fold1: shrinkage=0.01, interaction.depth=2, n.minobsinnode=10, n.trees=300
## + Fold1: shrinkage=0.01, interaction.depth=3, n.minobsinnode=10, n.trees=300
## - Fold1: shrinkage=0.01, interaction.depth=3, n.minobsinnode=10, n.trees=300
## + Fold1: shrinkage=0.01, interaction.depth=4, n.minobsinnode=10, n.trees=300
## - Fold1: shrinkage=0.01, interaction.depth=4, n.minobsinnode=10, n.trees=300
## + Fold1: shrinkage=0.01, interaction.depth=5, n.minobsinnode=10, n.trees=300
## - Fold1: shrinkage=0.01, interaction.depth=5, n.minobsinnode=10, n.trees=300
## + Fold1: shrinkage=0.02, interaction.depth=1, n.minobsinnode=10, n.trees=300
## - Fold1: shrinkage=0.02, interaction.depth=1, n.minobsinnode=10, n.trees=300
## + Fold1: shrinkage=0.02, interaction.depth=2, n.minobsinnode=10, n.trees=300
## - Fold1: shrinkage=0.02, interaction.depth=2, n.minobsinnode=10, n.trees=300
## + Fold1: shrinkage=0.02, interaction.depth=3, n.minobsinnode=10, n.trees=300
## - Fold1: shrinkage=0.02, interaction.depth=3, n.minobsinnode=10, n.trees=300
## + Fold1: shrinkage=0.02, interaction.depth=4, n.minobsinnode=10, n.trees=300
## - Fold1: shrinkage=0.02, interaction.depth=4, n.minobsinnode=10, n.trees=300
## + Fold1: shrinkage=0.02, interaction.depth=5, n.minobsinnode=10, n.trees=300
## - Fold1: shrinkage=0.02, interaction.depth=5, n.minobsinnode=10, n.trees=300
## + Fold1: shrinkage=0.03, interaction.depth=1, n.minobsinnode=10, n.trees=300
## - Fold1: shrinkage=0.03, interaction.depth=1, n.minobsinnode=10, n.trees=300
## + Fold1: shrinkage=0.03, interaction.depth=2, n.minobsinnode=10, n.trees=300
## - Fold1: shrinkage=0.03, interaction.depth=2, n.minobsinnode=10, n.trees=300
## + Fold1: shrinkage=0.03, interaction.depth=3, n.minobsinnode=10, n.trees=300
## - Fold1: shrinkage=0.03, interaction.depth=3, n.minobsinnode=10, n.trees=300
## + Fold1: shrinkage=0.03, interaction.depth=4, n.minobsinnode=10, n.trees=300
## - Fold1: shrinkage=0.03, interaction.depth=4, n.minobsinnode=10, n.trees=300
## + Fold1: shrinkage=0.03, interaction.depth=5, n.minobsinnode=10, n.trees=300
## - Fold1: shrinkage=0.03, interaction.depth=5, n.minobsinnode=10, n.trees=300
## + Fold2: shrinkage=0.01, interaction.depth=1, n.minobsinnode=10, n.trees=300
## - Fold2: shrinkage=0.01, interaction.depth=1, n.minobsinnode=10, n.trees=300
## + Fold2: shrinkage=0.01, interaction.depth=2, n.minobsinnode=10, n.trees=300
## - Fold2: shrinkage=0.01, interaction.depth=2, n.minobsinnode=10, n.trees=300
## + Fold2: shrinkage=0.01, interaction.depth=3, n.minobsinnode=10, n.trees=300
## - Fold2: shrinkage=0.01, interaction.depth=3, n.minobsinnode=10, n.trees=300
## + Fold2: shrinkage=0.01, interaction.depth=4, n.minobsinnode=10, n.trees=300
## - Fold2: shrinkage=0.01, interaction.depth=4, n.minobsinnode=10, n.trees=300
## + Fold2: shrinkage=0.01, interaction.depth=5, n.minobsinnode=10, n.trees=300
## - Fold2: shrinkage=0.01, interaction.depth=5, n.minobsinnode=10, n.trees=300
## + Fold2: shrinkage=0.02, interaction.depth=1, n.minobsinnode=10, n.trees=300
## - Fold2: shrinkage=0.02, interaction.depth=1, n.minobsinnode=10, n.trees=300
## + Fold2: shrinkage=0.02, interaction.depth=2, n.minobsinnode=10, n.trees=300
## - Fold2: shrinkage=0.02, interaction.depth=2, n.minobsinnode=10, n.trees=300
## + Fold2: shrinkage=0.02, interaction.depth=3, n.minobsinnode=10, n.trees=300
## - Fold2: shrinkage=0.02, interaction.depth=3, n.minobsinnode=10, n.trees=300
## + Fold2: shrinkage=0.02, interaction.depth=4, n.minobsinnode=10, n.trees=300
## - Fold2: shrinkage=0.02, interaction.depth=4, n.minobsinnode=10, n.trees=300
## + Fold2: shrinkage=0.02, interaction.depth=5, n.minobsinnode=10, n.trees=300
## - Fold2: shrinkage=0.02, interaction.depth=5, n.minobsinnode=10, n.trees=300
## + Fold2: shrinkage=0.03, interaction.depth=1, n.minobsinnode=10, n.trees=300
## - Fold2: shrinkage=0.03, interaction.depth=1, n.minobsinnode=10, n.trees=300
## + Fold2: shrinkage=0.03, interaction.depth=2, n.minobsinnode=10, n.trees=300
## - Fold2: shrinkage=0.03, interaction.depth=2, n.minobsinnode=10, n.trees=300
## + Fold2: shrinkage=0.03, interaction.depth=3, n.minobsinnode=10, n.trees=300
## - Fold2: shrinkage=0.03, interaction.depth=3, n.minobsinnode=10, n.trees=300
## + Fold2: shrinkage=0.03, interaction.depth=4, n.minobsinnode=10, n.trees=300
## - Fold2: shrinkage=0.03, interaction.depth=4, n.minobsinnode=10, n.trees=300
## + Fold2: shrinkage=0.03, interaction.depth=5, n.minobsinnode=10, n.trees=300
## - Fold2: shrinkage=0.03, interaction.depth=5, n.minobsinnode=10, n.trees=300
## Aggregating results
## Selecting tuning parameters
## Fitting n.trees = 300, interaction.depth = 2, shrinkage = 0.03, n.minobsinnode = 10 on full training set
## var rel.inf
## int_rate int_rate 52.15713767
## annual_inc annual_inc 20.11367968
## term60 term60 15.38405640
## loan_amnt loan_amnt 3.88375342
## dti dti 2.70884417
## emp_lengthNo_Info emp_lengthNo_Info 2.18518448
## emp_lengthMore_than_5 emp_lengthMore_than_5 0.71244474
## gradeD gradeD 0.65948805
## gradeC gradeC 0.45300896
## gradeF gradeF 0.33042705
## delinq_2yrs1 delinq_2yrs1 0.28482433
## delinq_2yrs2 delinq_2yrs2 0.21752132
## emp_length3_years emp_length3_years 0.19054337
## gradeG gradeG 0.17978678
## emp_length2_years emp_length2_years 0.13344147
## verification_statusVerified verification_statusVerified 0.07946746
## emp_length4_years emp_length4_years 0.07471584
## home_ownershipOTHER home_ownershipOTHER 0.07300815
## gradeE gradeE 0.06656274
## delinq_2yrs3 delinq_2yrs3 0.03915464
## gradeB gradeB 0.03833765
## home_ownershipOWN home_ownershipOWN 0.03461163
## emp_length1_year emp_length1_year 0.00000000
## home_ownershipRENT home_ownershipRENT 0.00000000
## Stochastic Gradient Boosting
##
## 28402 samples
## 10 predictor
## 2 classes: 'No', 'Yes'
##
## Pre-processing: centered (24), scaled (24)
## Resampling: Cross-Validated (2 fold)
## Summary of sample sizes: 14201, 14201
## Resampling results across tuning parameters:
##
## shrinkage interaction.depth n.trees ROC Sens Spec
## 0.01 1 100 0.6607750 1.0000000 0.0000000000
## 0.01 1 200 0.6712273 1.0000000 0.0000000000
## 0.01 1 300 0.6759884 1.0000000 0.0000000000
## 0.01 2 100 0.6721585 1.0000000 0.0000000000
## 0.01 2 200 0.6775673 1.0000000 0.0000000000
## 0.01 2 300 0.6801632 1.0000000 0.0000000000
## 0.01 3 100 0.6757399 1.0000000 0.0000000000
## 0.01 3 200 0.6795056 1.0000000 0.0000000000
## 0.01 3 300 0.6813151 1.0000000 0.0000000000
## 0.01 4 100 0.6773980 1.0000000 0.0000000000
## 0.01 4 200 0.6804939 1.0000000 0.0000000000
## 0.01 4 300 0.6816563 0.9999589 0.0007345739
## 0.01 5 100 0.6781840 1.0000000 0.0000000000
## 0.01 5 200 0.6807449 1.0000000 0.0000000000
## 0.01 5 300 0.6819611 0.9999178 0.0004897160
## 0.02 1 100 0.6717004 1.0000000 0.0000000000
## 0.02 1 200 0.6782971 1.0000000 0.0000000000
## 0.02 1 300 0.6802075 0.9998766 0.0002448580
## 0.02 2 100 0.6782101 1.0000000 0.0000000000
## 0.02 2 200 0.6814290 0.9999589 0.0002448580
## 0.02 2 300 0.6828039 0.9995477 0.0019588639
## 0.02 3 100 0.6792758 1.0000000 0.0000000000
## 0.02 3 200 0.6819323 0.9997944 0.0012242899
## 0.02 3 300 0.6830202 0.9992598 0.0041625857
## 0.02 4 100 0.6804681 1.0000000 0.0000000000
## 0.02 4 200 0.6826539 0.9997121 0.0012242899
## 0.02 4 300 0.6829046 0.9992598 0.0029382958
## 0.02 5 100 0.6800882 1.0000000 0.0000000000
## 0.02 5 200 0.6813094 0.9995477 0.0029382958
## 0.02 5 300 0.6817677 0.9988897 0.0048971596
## 0.03 1 100 0.6751455 1.0000000 0.0000000000
## 0.03 1 200 0.6804024 0.9998766 0.0002448580
## 0.03 1 300 0.6814905 0.9996299 0.0009794319
## 0.03 2 100 0.6794854 1.0000000 0.0000000000
## 0.03 2 200 0.6821794 0.9995477 0.0017140059
## 0.03 2 300 0.6831793 0.9990542 0.0036728697
## 0.03 3 100 0.6812012 1.0000000 0.0000000000
## 0.03 3 200 0.6821876 0.9992598 0.0034280118
## 0.03 3 300 0.6821057 0.9986019 0.0061214496
## 0.03 4 100 0.6817125 0.9998766 0.0009794319
## 0.03 4 200 0.6826686 0.9987663 0.0046523017
## 0.03 4 300 0.6824546 0.9981906 0.0061214496
## 0.03 5 100 0.6812935 0.9998355 0.0009794319
## 0.03 5 200 0.6816984 0.9984374 0.0058765916
## 0.03 5 300 0.6808369 0.9975327 0.0097943193
##
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 300, interaction.depth =
## 2, shrinkage = 0.03 and n.minobsinnode = 10.
Predict probabilities and plot ROC’s
# Predicted default probabilities from the tuned GBM; column 2 is P(default = 1)
watched_prob_GBM <- predict(gbmFit1, combined_test, type = "prob")[, 2]
ROC_GBM <- suppressMessages(roc(combined_test$default, watched_prob_GBM))
AUC_GBM <- round(auc(ROC_GBM) * 100, digits = 2)
# Overlay the ROC curves of all three models with the chance diagonal
g2 <- ggroc(list("Log Reg" = ROC_lr, "RF" = ROC_forest, "GBM" = ROC_GBM))
g2 +
  ggtitle(paste("Log Reg=", AUC_lr, "%", " vs RF=", AUC_forest, "%",
                " vs GBM=", AUC_GBM, "%", sep = "")) +
  geom_segment(aes(x = 1, xend = 0, y = 0, yend = 1), color = "grey", linetype = "dashed")
Plot the estimated probabilities.
# Plot the estimated probabilities for defaults and non-defaults
# Density of the GBM predicted scores, split by actual outcome
ggplot(combined_test, aes(watched_prob_GBM, color = as.factor(default))) +
  geom_density(size = 1) +
  ggtitle("Test Set's Predicted Score") +
  xlab("Estimated Probability")
# Empirical CDF of the GBM predicted scores, split by actual outcome
ggplot(combined_test, aes(watched_prob_GBM, color = as.factor(default))) +
  stat_ecdf(size = 1) +
  ggtitle("Test Set's Predicted Score") +
  xlab("Estimated Probability")
My correlation table is as below. The correlation between all the models is high except between KNN and GBM, but still lower than 1. From this correlation table, we can conclude that most of the models will give us similar results except KNN and GBM, and by using this correlation result, we can save time and computational power.
           ranger       gbm       knn       glm
ranger  1.0000000 0.8584990 0.8908491 0.9395568
gbm     0.8584990 1.0000000 0.5370328 0.9095530
knn     0.8908491 0.5370328 1.0000000 0.7581808
glm     0.9395568 0.9095530 0.7581808 1.0000000
Through the ensemble method, I achieved the highest AUC, 68.64%. The ROC following the stacking method improves by 0.53% compared to the simple logistic regression model. I fed my stacking model with four different classification algorithms: logistic regression, KNN, GBM, and ranger (random forest). Most of the time, an ensemble method gives a better result because when the correlation between models is less than 1, the impact of the errors is diminished, as they partly cancel each other out.
library(caretEnsemble)
# Common resampling scheme so every base learner sees identical folds
my_control <- trainControl(
  method = "cv",
  number = 5,
  savePredictions = "final",
  classProbs = TRUE,
  summaryFunction = twoClassSummary,
  verboseIter = TRUE
)
# Fit the base learners for stacking; glm via methodList, the tuned models
# via tuneList with the best hyperparameters found in the searches above.
# NOTE(review): the RF tuning above selected min.node.size = 9; confirm
# that min.node.size = 7 is intended here.
model_list <- caretList(
  default ~ ., data = train,
  trControl = my_control,
  metric = "ROC",
  methodList = c("glm"),
  preProcess = c("center", "scale"),
  tuneList = list( ## Change the parameters with the best parameters you found above
    ranger = caretModelSpec(
      method = "ranger",
      tuneGrid = data.frame(mtry = 2, splitrule = "gini", min.node.size = 7)
    ),
    gbm = caretModelSpec(
      method = "gbm",
      tuneGrid = data.frame(interaction.depth = 2, n.trees = 300,
                            shrinkage = 0.03, n.minobsinnode = 10),
      verbose = FALSE
    ),
    knn = caretModelSpec(method = "knn", tuneGrid = data.frame(k = 200)) # add knn to the code
  )
)
## Warning in trControlCheck(x = trControl, y = target): indexes not defined in
## trControl. Attempting to set them ourselves, so each model in the ensemble will
## have the same resampling indexes.
## + Fold1: mtry=2, splitrule=gini, min.node.size=7
## - Fold1: mtry=2, splitrule=gini, min.node.size=7
## + Fold2: mtry=2, splitrule=gini, min.node.size=7
## - Fold2: mtry=2, splitrule=gini, min.node.size=7
## + Fold3: mtry=2, splitrule=gini, min.node.size=7
## - Fold3: mtry=2, splitrule=gini, min.node.size=7
## + Fold4: mtry=2, splitrule=gini, min.node.size=7
## - Fold4: mtry=2, splitrule=gini, min.node.size=7
## + Fold5: mtry=2, splitrule=gini, min.node.size=7
## - Fold5: mtry=2, splitrule=gini, min.node.size=7
## Aggregating results
## Fitting final model on full training set
## + Fold1: interaction.depth=2, n.trees=300, shrinkage=0.03, n.minobsinnode=10
## - Fold1: interaction.depth=2, n.trees=300, shrinkage=0.03, n.minobsinnode=10
## + Fold2: interaction.depth=2, n.trees=300, shrinkage=0.03, n.minobsinnode=10
## - Fold2: interaction.depth=2, n.trees=300, shrinkage=0.03, n.minobsinnode=10
## + Fold3: interaction.depth=2, n.trees=300, shrinkage=0.03, n.minobsinnode=10
## - Fold3: interaction.depth=2, n.trees=300, shrinkage=0.03, n.minobsinnode=10
## + Fold4: interaction.depth=2, n.trees=300, shrinkage=0.03, n.minobsinnode=10
## - Fold4: interaction.depth=2, n.trees=300, shrinkage=0.03, n.minobsinnode=10
## + Fold5: interaction.depth=2, n.trees=300, shrinkage=0.03, n.minobsinnode=10
## - Fold5: interaction.depth=2, n.trees=300, shrinkage=0.03, n.minobsinnode=10
## Aggregating results
## Fitting final model on full training set
## + Fold1: k=200
## - Fold1: k=200
## + Fold2: k=200
## - Fold2: k=200
## + Fold3: k=200
## - Fold3: k=200
## + Fold4: k=200
## - Fold4: k=200
## + Fold5: k=200
## - Fold5: k=200
## Aggregating results
## Fitting final model on full training set
## + Fold1: parameter=none
## - Fold1: parameter=none
## + Fold2: parameter=none
## - Fold2: parameter=none
## + Fold3: parameter=none
## - Fold3: parameter=none
## + Fold4: parameter=none
## - Fold4: parameter=none
## + Fold5: parameter=none
## - Fold5: parameter=none
## Aggregating results
## Fitting final model on full training set
## Length Class Mode
## ranger 24 train list
## gbm 24 train list
## knn 24 train list
## glm 24 train list
## ranger gbm knn glm
## ranger 1.0000000 0.9477972 0.8057220 0.9388746
## gbm 0.9477972 1.0000000 0.5875943 0.7870013
## knn 0.8057220 0.5875943 1.0000000 0.9244965
## glm 0.9388746 0.7870013 0.9244965 1.0000000
Implement stacking and estimate and plot the ROC of the resulting model for the test data.
# See the instructions at the end
# Stack the base learners with a logistic-regression meta-model; the
# meta-model is trained on the base learners' cross-validated predictions
glm_ensemble <- caretStack(
  model_list,
  method = "glm",
  metric = "ROC",
  trControl = trainControl(
    method = "cv",
    number = 2,
    savePredictions = "final",
    classProbs = TRUE,
    summaryFunction = twoClassSummary
  )
)
# Check the summary of the results
summary(glm_ensemble)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7725 -0.5656 -0.4592 -0.3841 2.3766
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.12658 0.05237 -59.699 < 2e-16 ***
## ranger 3.78280 0.98347 3.846 0.000120 ***
## gbm 2.55005 0.72596 3.513 0.000444 ***
## knn 0.48062 0.53247 0.903 0.366725
## glm 1.65046 0.70857 2.329 0.019844 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 23391 on 28401 degrees of freedom
## Residual deviance: 21967 on 28397 degrees of freedom
## AIC: 21977
##
## Number of Fisher Scoring iterations: 5
# Plot the ROC of the stacked ensemble against the simple logistic regression
# Predicted default probabilities from the stacked meta-model on the test set
ensemble_prob <- predict(glm_ensemble, combined_test, type = "prob")
ROC_ensemble <- roc(combined_test$default, ensemble_prob)
AUC_ensemble <- round(ROC_ensemble$auc * 100, digits = 2)
# Label string for the ensemble's AUC
AUC_ensemble_text <- paste("Stacking Ensemble AUC=", AUC_ensemble, "%", sep = "")
# Overlay both ROC curves with the chance diagonal for reference
ggroc(list("Logistic" = ROC_lr, "Stacking Ensemble" = ROC_ensemble)) +
  theme_gray() +
  geom_segment(aes(x = 1, xend = 0, y = 0, yend = 1),
               color = "grey", linetype = "dashed") +
  labs(
    title = "Lending Club Data",
    subtitle = paste("LR AUC =", AUC_lr, "% | Stacking Ensemble AUC = ", AUC_ensemble, "%")
  )